In [1]:
#IMPORT THE LIBRARIES
import pandas as pd
import numpy as np

import os
import sys

# librosa is a Python library for analyzing audio and music. It can be used to extract the data from the audio files we will see it later.
import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# to play the audio files
import IPython.display as ipd
from IPython.display import Audio
import keras
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM,BatchNormalization , GRU
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from keras.layers import Input, Flatten, Dropout, Activation
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import SGD



import warnings
if not sys.warnoptions:
    warnings.simplefilter("ignore")
warnings.filterwarnings("ignore", category=DeprecationWarning)
In [2]:
# path to the directory
# Root of the RAVDESS speech corpus as mounted on Kaggle (one sub-folder per actor).
RAVD = "/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/"
In [3]:
# Build a catalogue of every RAVDESS clip: label ("gender_emotion"), source
# name, and the full file path.
actor_dirs = sorted(os.listdir(RAVD))

emotion = []
gender = []
path = []
for actor_dir in actor_dirs:
    for wav_name in os.listdir(RAVD + actor_dir):
        # RAVDESS file names encode metadata as dash-separated fields:
        # field 2 is the emotion code, field 6 the actor id (even = female).
        fields = wav_name.split('.')[0].split('-')
        emotion.append(int(fields[2]))
        gender.append("female" if int(fields[6]) % 2 == 0 else "male")
        path.append(RAVD + actor_dir + '/' + wav_name)

# Emotion codes 1 and 2 both map to 'neutral', which is why that class
# ends up with twice as many clips as the others.
emotion_names = {1: 'neutral', 2: 'neutral', 3: 'happy', 4: 'sad',
                 5: 'angry', 6: 'fear', 7: 'disgust', 8: 'surprise'}
RAVD_df = pd.DataFrame({
    'labels': [g + '_' + emotion_names[e] for g, e in zip(gender, emotion)],
    'source': 'RAVDESS',
    'path': path,
})
RAVD_df.labels.value_counts()
Out[3]:
male_neutral       144
female_neutral     144
male_sad            96
male_fear           96
male_happy          96
male_disgust        96
male_angry          96
male_surprise       96
female_surprise     96
female_disgust      96
female_fear         96
female_sad          96
female_happy        96
female_angry        96
Name: labels, dtype: int64
In [4]:
# Class distribution of the 14 gender_emotion labels (balanced except
# 'neutral', which merges two RAVDESS emotion codes).
plt.figure(figsize=(12, 5))
plt.title('Count of Emotions', size=16)
# Fix: pass the variable as a keyword — positional use of countplot's data
# argument was deprecated in seaborn 0.12 and removed afterwards, so
# sns.countplot(RAVD_df.labels) breaks on current seaborn versions.
sns.countplot(x=RAVD_df.labels)
plt.ylabel('Count', size=12)
plt.xlabel('Emotions', size=12)
plt.xticks(rotation=45)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.show()
No description has been provided for this image
In [5]:
# Listen to a sample clip: actress 08, emotion code 03 (happy).
happy_file = RAVD + 'Actor_08/03-01-03-02-02-01-08.wav'
data, sr = librosa.load(happy_file)
ipd.Audio(happy_file)
Out[5]:
Your browser does not support the audio element.
In [6]:
# Log-mel spectrogram of the happy clip: mel power spectrum converted to dB.
plt.figure(figsize=(10, 5))
mel_power = librosa.feature.melspectrogram(y=data, sr=sr, n_mels=128, fmax=8000)
mel_db = librosa.power_to_db(mel_power)
librosa.display.specshow(mel_db, y_axis='mel', fmax=8000, x_axis='time');
plt.title('Mel Spectrogram - Female Happy')
plt.colorbar(format='%+2.0f dB')
Out[6]:
<matplotlib.colorbar.Colorbar at 0x797a7a5f9b50>
No description has been provided for this image
In [7]:
# A second clip from the same actress: emotion code 06 (fear).
fear_file = RAVD + 'Actor_08/03-01-06-01-01-01-08.wav'
data, sr = librosa.load(fear_file)
ipd.Audio(fear_file)
Out[7]:
Your browser does not support the audio element.
In [8]:
# Log-mel spectrogram of the fear clip for comparison with the happy one.
plt.figure(figsize=(10, 5))
mel_power = librosa.feature.melspectrogram(y=data, sr=sr, n_mels=128, fmax=8000)
mel_db = librosa.power_to_db(mel_power)
librosa.display.specshow(mel_db, y_axis='mel', fmax=8000, x_axis='time');
plt.title('Mel Spectrogram - Female Fear')
plt.colorbar(format='%+2.0f dB');
No description has been provided for this image

Next, we look at mel spectrograms for additional clips — a female disgust track and a male fearful track

In [9]:
# Female disgust sample: actress 20, emotion code 08.
disgust_file = RAVD + 'Actor_20/03-01-08-02-02-02-20.wav'
data, sr = librosa.load(disgust_file)
ipd.Audio(disgust_file)
Out[9]:
Your browser does not support the audio element.
In [10]:
# Log-mel spectrogram of the disgust clip.
plt.figure(figsize=(10, 5))
mel_power = librosa.feature.melspectrogram(y=data, sr=sr, n_mels=128, fmax=8000)
mel_db = librosa.power_to_db(mel_power)
librosa.display.specshow(mel_db, y_axis='mel', fmax=8000, x_axis='time');
plt.title('Mel Spectrogram - Female Disgust')
plt.colorbar(format='%+2.0f dB');
No description has been provided for this image
In [11]:
# Male fearful sample: actor 19, emotion code 04.
male_fear_file = RAVD + 'Actor_19/03-01-04-01-02-01-19.wav'
data, sr = librosa.load(male_fear_file)
ipd.Audio(male_fear_file)
Out[11]:
Your browser does not support the audio element.
In [12]:
# Log-mel spectrogram of the male fearful clip.
# (title string kept verbatim, including the original 'Fearfull' spelling)
plt.figure(figsize=(10, 5))
mel_power = librosa.feature.melspectrogram(y=data, sr=sr, n_mels=128, fmax=8000)
mel_db = librosa.power_to_db(mel_power)
librosa.display.specshow(mel_db, y_axis='mel', fmax=8000, x_axis='time');
plt.title('Mel Spectrogram - Male Fearfull')
plt.colorbar(format='%+2.0f dB');
No description has been provided for this image
In [13]:
# MFCCs of a female angry clip (actor 18).
# NOTE(review): this cell uses a relative "../input/..." path whose directory
# layout differs from the RAVD constant above — confirm it resolves here.
path = "../input/ravdess-emotional-speech-audio/Actor_18/03-01-05-01-01-01-18.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=2.5, sr=22050*2, offset=0.5)
mfcc = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13)

# Show the 13 MFCC bands over time, then play the clip.
plt.figure(figsize=(16, 10))
plt.subplot(3, 1, 1)
librosa.display.specshow(mfcc, x_axis='time')
plt.ylabel('MFCC')
plt.colorbar()

ipd.Audio(path)
Out[13]:
Your browser does not support the audio element.
No description has been provided for this image
In [14]:
# MFCCs of a male angry clip (actor 17) — counterpart to the previous cell.
# NOTE(review): relative "../input/..." path, layout differs from RAVD above.
path = "../input/ravdess-emotional-speech-audio/Actor_17/03-01-05-01-01-02-17.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=2.5, sr=22050*2, offset=0.5)
mfcc = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13)

# Show the 13 MFCC bands over time, then play the clip.
plt.figure(figsize=(16, 10))
plt.subplot(3, 1, 1)
librosa.display.specshow(mfcc, x_axis='time')
plt.ylabel('MFCC')
plt.colorbar()

ipd.Audio(path)
Out[14]:
Your browser does not support the audio element.
No description has been provided for this image
In [15]:
# Compare averaged MFCC curves for angry clips, female vs male.
# Fix: the MFCC matrix was computed TWICE per clip with the first result
# immediately discarded; compute it once and average over the 13 coefficients.
path = "../input/ravdess-emotional-speech-audio/Actor_18/03-01-05-01-01-01-18.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=2.5, sr=22050*2, offset=0.5)
female = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
print(len(female))

path = "../input/ravdess-emotional-speech-audio/Actor_17/03-01-05-01-01-02-17.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=2.5, sr=22050*2, offset=0.5)
male = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
print(len(male))

# Overlay the two averaged MFCC trajectories.
plt.figure(figsize=(16, 10))
plt.subplot(3, 1, 1)
plt.plot(female, label='female')
plt.plot(male, label='male')
plt.legend()
216
216
Out[15]:
<matplotlib.legend.Legend at 0x797a790ea490>
No description has been provided for this image
In [16]:
# Compare averaged MFCC curves for surprised clips, female vs male.
# Fix: the MFCC matrix was computed TWICE per clip with the first result
# immediately discarded; compute it once and average over the 13 coefficients.
path = "../input/ravdess-emotional-speech-audio/Actor_20/03-01-08-02-01-02-20.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=2.5, sr=22050*2, offset=0.5)
female = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
print(len(female))

path = "../input/ravdess-emotional-speech-audio/Actor_21/03-01-08-02-01-01-21.wav"
X, sample_rate = librosa.load(path, res_type='kaiser_fast', duration=2.5, sr=22050*2, offset=0.5)
male = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13), axis=0)
print(len(male))

# Overlay the two averaged MFCC trajectories.
plt.figure(figsize=(16, 10))
plt.subplot(3, 1, 1)
plt.plot(female, label='female')
plt.plot(male, label='male')
plt.legend()
216
216
Out[16]:
<matplotlib.legend.Legend at 0x797a790bead0>
No description has been provided for this image
In [17]:
# DATA-AUGMENTATION HELPERS

def noise(data):
    """Add white noise scaled to at most 3.5% of the signal's peak amplitude."""
    noise_amp = 0.035 * np.random.uniform() * np.amax(data)
    return data + noise_amp * np.random.normal(size=data.shape[0])

def stretch(data, rate=0.8):
    """Time-stretch the signal by `rate` (< 1 slows it down), pitch unchanged."""
    # Fix: `rate` is keyword-only in librosa >= 0.10; the keyword form also
    # works on librosa 0.9, so this is compatible both ways.
    return librosa.effects.time_stretch(data, rate=rate)

def shift(data):
    """Circularly shift the signal by a random offset in [-5000, 5000) samples."""
    shift_range = int(np.random.uniform(low=-5, high=5) * 1000)
    return np.roll(data, shift_range)

def pitch(data, sampling_rate, pitch_factor=0.7):
    """Shift the pitch by `pitch_factor` semitones, preserving duration."""
    # Fix: `sr` and `n_steps` are keyword-only in librosa >= 0.10.
    return librosa.effects.pitch_shift(data, sr=sampling_rate, n_steps=pitch_factor)
In [18]:
# Pick one clip (row 471 of the catalogue) to demonstrate each augmentation on.
path = RAVD_df['path'].iloc[471]
data, sample_rate = librosa.load(path)
In [19]:
# BASELINE: the untouched waveform, plotted and played.
# (removed a redundant mid-notebook `import librosa.display` — it is already
# imported in the imports cell at the top of the notebook)
plt.figure(figsize=(12, 5))
librosa.display.waveshow(y=data, sr=sample_rate)
Audio(path)
Out[19]:
Your browser does not support the audio element.
No description has been provided for this image
In [20]:
# Augmentation 1: additive white noise.
noisy_wave = noise(data)
plt.figure(figsize=(12, 5))
librosa.display.waveshow(y=noisy_wave, sr=sample_rate)
Audio(noisy_wave, rate=sample_rate)
Out[20]:
Your browser does not support the audio element.
No description has been provided for this image
In [21]:
# Augmentation 2: time stretching (default rate 0.8 slows the clip down).
stretched_wave = stretch(data)
plt.figure(figsize=(12, 5))
librosa.display.waveshow(y=stretched_wave, sr=sample_rate)
Audio(stretched_wave, rate=sample_rate)
Out[21]:
Your browser does not support the audio element.
No description has been provided for this image
In [22]:
# Augmentation 3: random circular time shift.
shifted_wave = shift(data)
plt.figure(figsize=(12, 5))
librosa.display.waveshow(y=shifted_wave, sr=sample_rate)
Audio(shifted_wave, rate=sample_rate)
Out[22]:
Your browser does not support the audio element.
No description has been provided for this image
In [23]:
# Augmentation 4: pitch shifting (default 0.7 semitones up).
pitched_wave = pitch(data, sample_rate)
plt.figure(figsize=(12, 5))
librosa.display.waveshow(y=pitched_wave, sr=sample_rate)
Audio(pitched_wave, rate=sample_rate)
Out[23]:
Your browser does not support the audio element.
No description has been provided for this image
In [24]:
def feat_ext(data, sr=22050):
    """Return the 20 MFCCs of `data`, each averaged over time (shape (20,)).

    Fix: the original read the module-level `sample_rate` left over from a
    previous cell (hidden kernel state) instead of the rate of the clip being
    processed. The rate is now an explicit parameter; the default 22050 is
    librosa.load's default, matching the old behavior for existing callers.
    """
    return np.mean(librosa.feature.mfcc(y=data, sr=sr).T, axis=0)

def get_feat(path):
    """Load `path` and return a (3, 20) array of MFCC feature rows:
    row 0 = clean audio, row 1 = noise-augmented, row 2 = stretched + pitch-shifted.
    """
    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)
    # normal data
    result = np.array(feat_ext(data, sample_rate))
    # data with noise
    result = np.vstack((result, feat_ext(noise(data), sample_rate)))
    # data with stretch and pitch
    data_stretch_pitch = pitch(stretch(data), sample_rate)
    result = np.vstack((result, feat_ext(data_stretch_pitch, sample_rate)))
    return result
In [25]:
# Quick sanity check of the clip catalogue built above.
RAVD_df.head()
Out[25]:
labels source path
0 male_neutral RAVDESS /kaggle/input/ravdess-emotional-speech-audio/a...
1 male_neutral RAVDESS /kaggle/input/ravdess-emotional-speech-audio/a...
2 male_sad RAVDESS /kaggle/input/ravdess-emotional-speech-audio/a...
3 male_neutral RAVDESS /kaggle/input/ravdess-emotional-speech-audio/a...
4 male_neutral RAVDESS /kaggle/input/ravdess-emotional-speech-audio/a...
In [26]:
# Build the training set: each clip contributes three feature rows
# (clean, noisy, stretched+pitched), all sharing the clip's label —
# so 1440 clips become 4320 rows.
X, Y = [], []
for clip_path, clip_label in zip(RAVD_df['path'], RAVD_df['labels']):
    for feature_row in get_feat(clip_path):
        X.append(feature_row)
        Y.append(clip_label)
In [27]:
# Persist the extracted features so later runs can restart from the CSV
# instead of re-running the (slow) extraction above.
Emotions = pd.DataFrame(X)
Emotions['labels'] = Y
Emotions.to_csv('emotion.csv', index=False)
Emotions.head()
Out[27]:
0 1 2 3 4 5 6 7 8 9 ... 11 12 13 14 15 16 17 18 19 labels
0 -626.266724 93.891739 -0.696724 17.828402 9.496767 2.025836 -2.726057 -8.519138 -12.432029 -6.580182 ... -2.754473 0.774303 -5.368925 -0.340401 1.479823 -8.706111 -2.767464 -1.620493 -1.525633 male_neutral
1 -424.219659 33.409893 13.512744 9.723939 4.621845 1.812571 -1.768389 -6.672808 -7.872926 -5.476621 ... -0.298616 -1.024360 -1.842204 -0.319954 -0.260445 -4.467974 -3.670608 0.103759 -2.584844 male_neutral
2 -680.248840 90.474678 -2.995482 17.773315 6.315861 0.721663 -6.446163 -11.472776 -14.421964 -4.905107 ... -1.993616 -0.631891 -6.631033 1.132353 -2.568039 -8.887710 -1.243952 -2.682266 -6.088979 male_neutral
3 -634.959839 72.811478 -3.487027 20.697269 10.188320 -0.667840 -3.293633 -7.447816 -16.703850 -2.161060 ... -3.714514 0.273592 -4.517450 -1.117245 0.534381 -6.885534 -1.295200 -3.289555 0.756877 male_neutral
4 -453.426387 31.440529 8.427989 11.046314 5.784891 -1.362626 -2.919144 -6.799702 -9.867522 -5.229569 ... -1.979740 -1.493574 -2.101019 -1.031402 -0.849473 -4.312014 -3.388788 -0.723249 -1.842601 male_neutral

5 rows × 21 columns

In [28]:
# can use this directly from saved feature .csv file
# (reloading makes the notebook restartable from here without re-extracting)
Emotions = pd.read_csv('./emotion.csv')
Emotions.head()
Out[28]:
0 1 2 3 4 5 6 7 8 9 ... 11 12 13 14 15 16 17 18 19 labels
0 -626.266724 93.891739 -0.696724 17.828402 9.496767 2.025836 -2.726057 -8.519138 -12.432029 -6.580182 ... -2.754473 0.774303 -5.368925 -0.340401 1.479823 -8.706111 -2.767464 -1.620493 -1.525633 male_neutral
1 -424.219659 33.409893 13.512744 9.723939 4.621845 1.812571 -1.768389 -6.672808 -7.872926 -5.476621 ... -0.298616 -1.024360 -1.842204 -0.319954 -0.260445 -4.467974 -3.670608 0.103759 -2.584844 male_neutral
2 -680.248840 90.474678 -2.995482 17.773315 6.315861 0.721663 -6.446163 -11.472776 -14.421964 -4.905107 ... -1.993616 -0.631891 -6.631033 1.132353 -2.568039 -8.887710 -1.243952 -2.682266 -6.088979 male_neutral
3 -634.959839 72.811478 -3.487027 20.697269 10.188320 -0.667840 -3.293633 -7.447816 -16.703850 -2.161060 ... -3.714514 0.273592 -4.517450 -1.117245 0.534381 -6.885534 -1.295200 -3.289555 0.756877 male_neutral
4 -453.426387 31.440529 8.427989 11.046314 5.784891 -1.362626 -2.919144 -6.799702 -9.867522 -5.229569 ... -1.979740 -1.493574 -2.101019 -1.031402 -0.849473 -4.312014 -3.388788 -0.723249 -1.842601 male_neutral

5 rows × 21 columns

In [29]:
# Split the numeric feature columns (all but the last) from the string labels.
X = Emotions.iloc[: ,:-1].values
Y = Emotions['labels'].values
In [30]:
# As this is a multiclass classification problem onehotencoding our Y
# OneHotEncoder sorts the 14 label strings alphabetically; Y becomes
# a dense (n_samples, 14) array.
encoder = OneHotEncoder()
Y = encoder.fit_transform(np.array(Y).reshape(-1,1)).toarray()
In [31]:
# Train and Test Split 
# Uses sklearn's default test_size (0.25) and no stratification;
# random_state=0 makes the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=0, shuffle=True)
x_train.shape, y_train.shape, x_test.shape, y_test.shape
Out[31]:
((3240, 20), (3240, 14), (1080, 20), (1080, 14))
In [32]:
# Reshape for LSTM 
# NOTE(review): this reshapes the *unscaled* arrays — the next cell scales
# x_train/x_test separately, and X_train/X_test do not appear to be used
# again below. Confirm before relying on them.
X_train = x_train.reshape(x_train.shape[0] , x_train.shape[1] , 1)
X_test = x_test.reshape(x_test.shape[0] , x_test.shape[1] , 1)
In [33]:
# scaling our data with sklearn's Standard scaler
# (fit on the training split only, then reuse the same statistics for test
# — avoids leaking test-set statistics into training)
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
x_train.shape, y_train.shape, x_test.shape, y_test.shape
Out[33]:
((3240, 20), (3240, 14), (1080, 20), (1080, 14))
In [42]:
# Additional imports for advanced feature extraction
import scipy
from scipy.fftpack import dct
import scipy.signal as signal
from scipy.fftpack import dct

def extract_plp(audio, sr, n_coeff=13):
    """
    Extract Perceptual Linear Prediction coefficients
    PLP is based on human auditory perception characteristics
    """
    # Pre-emphasis filter
    pre_emphasis = 0.97
    emphasized_audio = np.append(audio[0], audio[1:] - pre_emphasis * audio[:-1])
    
    # Frame the signal
    frame_size = 0.025
    frame_stride = 0.01
    frame_length = int(round(frame_size * sr))
    frame_step = int(round(frame_stride * sr))
    
    signal_length = len(emphasized_audio)
    num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step))
    
    pad_signal_length = num_frames * frame_step + frame_length
    z = np.zeros((pad_signal_length - signal_length))
    pad_signal = np.append(emphasized_audio, z)
    
    indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + \
              np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
    frames = pad_signal[indices.astype(np.int32, copy=False)]
    
    # Apply hamming window
    frames *= np.hamming(frame_length)
    
    # Power spectrum
    NFFT = 512
    mag_frames = np.absolute(np.fft.rfft(frames, NFFT))
    pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2))
    
    # Mel scale filter banks
    nfilt = 26
    low_freq_mel = 0
    high_freq_mel = (2595 * np.log10(1 + (sr / 2) / 700))
    mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)
    hz_points = (700 * (10**(mel_points / 2595) - 1))
    
    bin_points = np.floor((NFFT + 1) * hz_points / sr)
    fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1))))
    
    for m in range(1, nfilt + 1):
        f_m_minus = int(bin_points[m - 1])
        f_m = int(bin_points[m])
        f_m_plus = int(bin_points[m + 1])
        
        for k in range(f_m_minus, f_m):
            fbank[m - 1, k] = (k - bin_points[m - 1]) / (bin_points[m] - bin_points[m - 1])
        for k in range(f_m, f_m_plus):
            fbank[m - 1, k] = (bin_points[m + 1] - k) / (bin_points[m + 1] - bin_points[m])
    
    filter_banks = np.dot(pow_frames, fbank.T)
    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)
    filter_banks = np.log(filter_banks)
    
    # Apply DCT to get PLP coefficients
    plp = dct(filter_banks, type=2, axis=1, norm='ortho')[:, :n_coeff]
    
    return np.mean(plp, axis=0)
In [43]:
def extract_lpcc(audio, sr, n_coeff=13):
    """
    Extract Linear Prediction Cepstral Coefficients (LPCC).

    The signal is pre-emphasised, cut into 25 ms frames every 10 ms,
    Hamming-windowed, and each frame is fitted with an order-`n_coeff` LPC
    model via Levinson-Durbin; the LPC polynomial is then converted to
    cepstral coefficients. Returns the per-frame LPCCs averaged over all
    usable frames (shape (n_coeff,)), or zeros if no frame qualified.
    """
    # Pre-emphasis: first-order high-pass y[t] = x[t] - 0.97*x[t-1].
    pre_emphasis = 0.97
    emphasized_audio = np.append(audio[0], audio[1:] - pre_emphasis * audio[:-1])

    # Framing: 25 ms windows with a 10 ms hop.
    frame_size = 0.025
    frame_stride = 0.01
    frame_length = int(round(frame_size * sr))
    frame_step = int(round(frame_stride * sr))

    signal_length = len(emphasized_audio)
    num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step))

    # Zero-pad so the last frame is complete.
    pad_signal_length = num_frames * frame_step + frame_length
    z = np.zeros((pad_signal_length - signal_length))
    pad_signal = np.append(emphasized_audio, z)

    # Index matrix lets all frames be gathered in one fancy-indexing op.
    indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + \
              np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
    frames = pad_signal[indices.astype(np.int32, copy=False)]

    # Apply window
    frames *= np.hamming(frame_length)

    # LPC analysis, one model per frame.
    lpc_order = n_coeff
    lpcc_features = []

    for frame in frames:
        # Autocorrelation, positive lags only.
        autocorr = np.correlate(frame, frame, mode='full')
        autocorr = autocorr[len(autocorr)//2:]

        # Levinson-Durbin recursion for LPC coefficients.
        # Zero-energy frames (autocorr[0] == 0) are skipped entirely and
        # contribute nothing to the final average.
        if len(autocorr) > lpc_order and autocorr[0] != 0:
            lpc_coeffs = np.zeros(lpc_order + 1)
            lpc_coeffs[0] = 1.0

            error = autocorr[0]  # prediction-error power, shrinks each order

            for i in range(1, lpc_order + 1):
                # Reflection coefficient for order i.
                lambda_val = -np.sum(lpc_coeffs[:i] * autocorr[i:0:-1]) / error
                # Update a[1..i] with the time-reversed lower-order solution;
                # the RHS product materialises a temporary array, so the
                # overlapping-slice update is safe.
                lpc_coeffs[1:i+1] += lambda_val * lpc_coeffs[i-1::-1]
                lpc_coeffs[i] = lambda_val
                error *= (1 - lambda_val**2)

            # Convert LPC to LPCC via the standard cepstral recursion:
            # c_n = -a_n - sum_{k=1}^{n-1} (k/n) * c_k * a_{n-k}
            lpcc = np.zeros(n_coeff)
            lpcc[0] = -lpc_coeffs[1]

            for n in range(2, n_coeff + 1):
                sum_val = 0
                for k in range(1, n):
                    sum_val += (k / n) * lpcc[k-1] * lpc_coeffs[n-k]
                lpcc[n-1] = -lpc_coeffs[n] - sum_val

            lpcc_features.append(lpcc)

    # Average over frames; fall back to zeros when every frame was rejected.
    if len(lpcc_features) > 0:
        return np.mean(lpcc_features, axis=0)
    else:
        return np.zeros(n_coeff)
In [44]:
def extract_gfcc(audio, sr, n_coeff=13):
    """
    Extract Gammatone-Frequency-Cepstral-Coefficient-style features.

    NOTE(review): despite the name, the filter bank below is the same
    triangular *mel* bank used elsewhere in this notebook, not a true
    gammatone/ERB bank; the "gammatone" character comes only from the
    cube-root power-law compression applied before the log/DCT stage.
    Returns the per-frame coefficients averaged over time (shape (n_coeff,)).
    """
    # Analysis parameters: 25 ms frames, 10 ms hop, 512-point FFT.
    frame_size = 0.025
    frame_stride = 0.01
    frame_length = int(round(frame_size * sr))
    frame_step = int(round(frame_stride * sr))
    NFFT = 512

    # Pre-emphasis: y[t] = x[t] - 0.97*x[t-1].
    pre_emphasis = 0.97
    emphasized_audio = np.append(audio[0], audio[1:] - pre_emphasis * audio[:-1])

    # Framing (zero-padded so the last frame is complete).
    signal_length = len(emphasized_audio)
    num_frames = int(np.ceil(float(np.abs(signal_length - frame_length)) / frame_step))

    pad_signal_length = num_frames * frame_step + frame_length
    z = np.zeros((pad_signal_length - signal_length))
    pad_signal = np.append(emphasized_audio, z)

    indices = np.tile(np.arange(0, frame_length), (num_frames, 1)) + \
              np.tile(np.arange(0, num_frames * frame_step, frame_step), (frame_length, 1)).T
    frames = pad_signal[indices.astype(np.int32, copy=False)]

    # Window
    frames *= np.hamming(frame_length)

    # Power spectrum (periodogram).
    mag_frames = np.absolute(np.fft.rfft(frames, NFFT))
    pow_frames = ((1.0 / NFFT) * ((mag_frames) ** 2))

    # Triangular mel filter bank: 26 filters from 0 Hz to Nyquist.
    nfilt = 26
    low_freq_mel = 0
    high_freq_mel = (2595 * np.log10(1 + (sr / 2) / 700))
    mel_points = np.linspace(low_freq_mel, high_freq_mel, nfilt + 2)
    hz_points = (700 * (10**(mel_points / 2595) - 1))

    bin_points = np.floor((NFFT + 1) * hz_points / sr)
    fbank = np.zeros((nfilt, int(np.floor(NFFT / 2 + 1))))

    for m in range(1, nfilt + 1):
        f_m_minus = int(bin_points[m - 1])   # left edge of triangle m
        f_m = int(bin_points[m])             # peak
        f_m_plus = int(bin_points[m + 1])    # right edge

        for k in range(f_m_minus, f_m):
            fbank[m - 1, k] = (k - bin_points[m - 1]) / (bin_points[m] - bin_points[m - 1])
        for k in range(f_m, f_m_plus):
            fbank[m - 1, k] = (bin_points[m + 1] - k) / (bin_points[m + 1] - bin_points[m])

    filter_banks = np.dot(pow_frames, fbank.T)
    filter_banks = np.where(filter_banks == 0, np.finfo(float).eps, filter_banks)

    # Power law compression (mimics cochlear compression)
    filter_banks = np.power(filter_banks, 0.33)
    filter_banks = np.log(filter_banks + 1e-8)  # +1e-8 guards the log of tiny values

    # DCT decorrelates the compressed log energies; keep first n_coeff.
    gfcc = dct(filter_banks, type=2, axis=1, norm='ortho')[:, :n_coeff]

    return np.mean(gfcc, axis=0)
In [45]:
def extract_all_features(path, feature_type='mfcc'):
    """
    Load `path` and return a (3, 20) feature array for the requested type
    ('mfcc', 'plp', 'lpcc' or 'gfcc'):
    row 0 = clean audio, row 1 = noise-augmented, row 2 = stretched + pitch-shifted.

    Raises ValueError for an unknown feature_type (the original if/elif chain
    silently fell through and crashed later with UnboundLocalError).
    """
    # One extractor per feature type replaces three copy-pasted if/elif chains.
    extractors = {
        'mfcc': lambda y, sr: np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=20).T, axis=0),
        'plp': lambda y, sr: extract_plp(y, sr, n_coeff=20),
        'lpcc': lambda y, sr: extract_lpcc(y, sr, n_coeff=20),
        'gfcc': lambda y, sr: extract_gfcc(y, sr, n_coeff=20),
    }
    if feature_type not in extractors:
        raise ValueError(f"unknown feature_type: {feature_type!r}")
    extract = extractors[feature_type]

    data, sample_rate = librosa.load(path, duration=2.5, offset=0.6)

    # Clean signal.
    result = np.array(extract(data, sample_rate))
    # Augmentation with noise.
    result = np.vstack((result, extract(noise(data), sample_rate)))
    # Augmentation with stretch and pitch.
    data_stretch_pitch = pitch(stretch(data), sample_rate)
    result = np.vstack((result, extract(data_stretch_pitch, sample_rate)))

    return result
In [46]:
# Dictionary to store features for each type
# Each entry maps a feature name to {'X': (4320, 20) array, 'Y': labels},
# since every one of the 1440 clips yields 3 rows (clean + two augmentations).
feature_sets = {}
feature_types = ['mfcc', 'plp', 'lpcc', 'gfcc']

for feat_type in feature_types:
    print(f"Extracting {feat_type.upper()} features...")
    X_feat, Y_feat = [], []

    for path, emotion in zip(RAVD_df['path'], RAVD_df['labels']):
        feature = extract_all_features(path, feature_type=feat_type)
        for ele in feature:
            X_feat.append(ele)
            Y_feat.append(emotion)

    feature_sets[feat_type] = {'X': np.array(X_feat), 'Y': np.array(Y_feat)}
    print(f"{feat_type.upper()}: Shape {feature_sets[feat_type]['X'].shape}")
Extracting MFCC features...
MFCC: Shape (4320, 20)
Extracting PLP features...
PLP: Shape (4320, 20)
Extracting LPCC features...
LPCC: Shape (4320, 20)
Extracting GFCC features...
GFCC: Shape (4320, 20)
In [48]:
def build_model(input_shape, num_classes):
    """
    CNN-LSTM hybrid for emotion classification.

    Two Conv1D/pool/batch-norm stages learn local patterns along the
    coefficient axis, two stacked LSTMs model the resulting sequence, and a
    small dense head produces softmax probabilities over `num_classes`.
    Compiled with Adam and categorical cross-entropy.
    """
    model = Sequential([
        # Convolutional feature extractor.
        Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu', input_shape=input_shape),
        MaxPooling1D(pool_size=2, strides=2, padding='same'),
        BatchNormalization(),
        Dropout(0.3),
        Conv1D(128, kernel_size=5, strides=1, padding='same', activation='relu'),
        MaxPooling1D(pool_size=2, strides=2, padding='same'),
        BatchNormalization(),
        Dropout(0.3),
        # Recurrent stage.
        LSTM(128, return_sequences=True),
        Dropout(0.3),
        LSTM(64),
        Dropout(0.3),
        # Classification head.
        Dense(64, activation='relu'),
        Dropout(0.3),
        Dense(num_classes, activation='softmax'),
    ])

    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    return model
In [49]:
# Train and evaluate one CNN-LSTM per feature type; everything needed for
# the later analysis cells (model, history, scaler, encoder, held-out split)
# is stored in `results`, keyed by feature type.
results = {}

for feat_type in feature_types:
    print(f"\n{'='*60}")
    print(f"Training model with {feat_type.upper()} features")
    print(f"{'='*60}")

    # Prepare data
    X = feature_sets[feat_type]['X']
    Y = feature_sets[feat_type]['Y']

    # One-hot encode labels (sklearn sorts the categories alphabetically).
    encoder = OneHotEncoder()
    Y_encoded = encoder.fit_transform(np.array(Y).reshape(-1,1)).toarray()

    # Train-test split
    # NOTE(review): augmented rows of the same clip can land on both sides of
    # the split, which may inflate test accuracy — consider splitting by clip.
    x_train, x_test, y_train, y_test = train_test_split(X, Y_encoded, 
                                                          test_size=0.25, 
                                                          random_state=42, 
                                                          shuffle=True)

    # Scale data (fit on the training split only; no test-set leakage).
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train)
    x_test_scaled = scaler.transform(x_test)

    # Reshape for CNN-LSTM: (samples, n_coefficients, 1 channel).
    x_train_reshaped = x_train_scaled.reshape(x_train_scaled.shape[0], x_train_scaled.shape[1], 1)
    x_test_reshaped = x_test_scaled.reshape(x_test_scaled.shape[0], x_test_scaled.shape[1], 1)

    print(f"Train shape: {x_train_reshaped.shape}, Test shape: {x_test_reshaped.shape}")

    # Build model
    model = build_model((x_train_reshaped.shape[1], 1), y_train.shape[1])

    # Train model (verbose=0 keeps the notebook output compact; note the test
    # set doubles as the validation set here).
    history = model.fit(x_train_reshaped, y_train, 
                        batch_size=64, 
                        epochs=100, 
                        validation_data=(x_test_reshaped, y_test),
                        verbose=0)

    # Evaluate
    test_loss, test_accuracy = model.evaluate(x_test_reshaped, y_test, verbose=0)

    # Store results
    results[feat_type] = {
        'model': model,
        'history': history,
        'test_accuracy': test_accuracy,
        'test_loss': test_loss,
        'x_test': x_test_reshaped,
        'y_test': y_test,
        'scaler': scaler,
        'encoder': encoder
    }

    print(f"Test Accuracy: {test_accuracy*100:.2f}%")
    print(f"Test Loss: {test_loss:.4f}")
============================================================
Training model with MFCC features
============================================================
Train shape: (3240, 20, 1), Test shape: (1080, 20, 1)
Test Accuracy: 82.96%
Test Loss: 0.7929

============================================================
Training model with PLP features
============================================================
Train shape: (3240, 20, 1), Test shape: (1080, 20, 1)
Test Accuracy: 79.54%
Test Loss: 0.8523

============================================================
Training model with LPCC features
============================================================
Train shape: (3240, 20, 1), Test shape: (1080, 20, 1)
Test Accuracy: 68.70%
Test Loss: 1.3467

============================================================
Training model with GFCC features
============================================================
Train shape: (3240, 20, 1), Test shape: (1080, 20, 1)
Test Accuracy: 81.48%
Test Loss: 0.7513
In [50]:
# Tabulate test accuracy and loss for each feature type.
comparison_data = [
    {
        'Feature Type': feat_type.upper(),
        'Test Accuracy (%)': round(results[feat_type]['test_accuracy'] * 100, 2),
        'Test Loss': round(results[feat_type]['test_loss'], 4),
    }
    for feat_type in feature_types
]

comparison_df = pd.DataFrame(comparison_data)
print("\nPerformance Comparison:")
print(comparison_df.to_string(index=False))
Performance Comparison:
Feature Type  Test Accuracy (%)  Test Loss
        MFCC              82.96     0.7929
         PLP              79.54     0.8523
        LPCC              68.70     1.3467
        GFCC              81.48     0.7513
In [51]:
# Train vs validation accuracy curves, one panel per feature type.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

for ax, feat_type in zip(axes.ravel(), feature_types):
    hist = results[feat_type]['history'].history
    ax.plot(hist['accuracy'], label='Train Accuracy', linewidth=2)
    ax.plot(hist['val_accuracy'], label='Validation Accuracy', linewidth=2)
    ax.set_title(f'{feat_type.upper()} - Accuracy', fontsize=12, fontweight='bold')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Accuracy')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [52]:
# Train vs validation loss curves, one panel per feature type.
fig, axes = plt.subplots(2, 2, figsize=(15, 10))

for ax, feat_type in zip(axes.ravel(), feature_types):
    hist = results[feat_type]['history'].history
    ax.plot(hist['loss'], label='Train Loss', linewidth=2)
    ax.plot(hist['val_loss'], label='Validation Loss', linewidth=2)
    ax.set_title(f'{feat_type.upper()} - Loss', fontsize=12, fontweight='bold')
    ax.set_xlabel('Epoch')
    ax.set_ylabel('Loss')
    ax.legend()
    ax.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()
No description has been provided for this image
In [53]:
# Find best performing feature
# (best_feat_type / best_model / y_*_classes are reused by the cells below)
best_feat = max(results.items(), key=lambda x: x[1]['test_accuracy'])
best_feat_type = best_feat[0]
best_model = best_feat[1]['model']

print(f"Best Feature: {best_feat_type.upper()}")
print(f"Accuracy: {best_feat[1]['test_accuracy']*100:.2f}%\n")

# Predictions: argmax converts softmax rows / one-hot rows to class indices.
y_pred = best_model.predict(results[best_feat_type]['x_test'], verbose=0)
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(results[best_feat_type]['y_test'], axis=1)

# Confusion matrix (rows = true class, columns = predicted class).
cm = confusion_matrix(y_true_classes, y_pred_classes)

plt.figure(figsize=(12, 10))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', cbar=True)
plt.title(f'Confusion Matrix - {best_feat_type.upper()}', fontsize=14, fontweight='bold')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()
Best Feature: MFCC
Accuracy: 82.96%

No description has been provided for this image
In [54]:
# Per-class precision/recall/F1 for the best feature set.
# np.unique sorts the label strings alphabetically, which matches the
# alphabetical category order used by OneHotEncoder, so target_names line
# up with the integer class indices.
emotion_labels = np.unique(RAVD_df['labels'])
print(f"Classification Report for {best_feat_type.upper()}:\n")
print(classification_report(y_true_classes, y_pred_classes, 
                            target_names=emotion_labels))
Classification Report for MFCC:

                 precision    recall  f1-score   support

   female_angry       0.93      0.79      0.85        84
 female_disgust       0.85      0.79      0.82        71
    female_fear       0.88      0.85      0.86        78
   female_happy       0.78      0.87      0.82        67
 female_neutral       0.78      0.96      0.86        96
     female_sad       0.84      0.69      0.76        74
female_surprise       0.85      0.91      0.88        64
     male_angry       0.85      0.93      0.89        74
   male_disgust       0.91      0.63      0.74        78
      male_fear       0.81      0.80      0.81        70
     male_happy       0.82      0.77      0.79        75
   male_neutral       0.78      0.94      0.85       109
       male_sad       0.77      0.76      0.76        70
  male_surprise       0.85      0.89      0.87        70

       accuracy                           0.83      1080
      macro avg       0.84      0.83      0.83      1080
   weighted avg       0.83      0.83      0.83      1080

In [55]:
# Bar chart of test accuracy per feature type, annotated with exact values.
plt.figure(figsize=(10, 6))
features = [f.upper() for f in feature_types]
accuracies = [results[f]['test_accuracy']*100 for f in feature_types]

bars = plt.bar(features, accuracies, color=['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728'])
plt.xlabel('Feature Type', fontsize=12, fontweight='bold')
plt.ylabel('Test Accuracy (%)', fontsize=12, fontweight='bold')
plt.title('Model Performance Across Different Features', fontsize=14, fontweight='bold')
plt.ylim([0, 100])

# Write each bar's value just above its top edge.
for rect in bars:
    height = rect.get_height()
    plt.text(rect.get_x() + rect.get_width()/2., height,
             f'{height:.2f}%', ha='center', va='bottom', fontsize=10, fontweight='bold')

plt.grid(True, alpha=0.3, axis='y')
plt.tight_layout()
plt.show()
No description has been provided for this image
In [56]:
# Save the best model
model_name = f'best_model_{best_feat_type}.keras'
save_dir = os.path.join(os.getcwd(), 'saved_models')

# exist_ok avoids the check-then-create race of the original isdir/makedirs pair.
os.makedirs(save_dir, exist_ok=True)

model_path = os.path.join(save_dir, model_name)
best_model.save(model_path)
print(f'Saved best model ({best_feat_type.upper()}) at {model_path}')
Saved best model (MFCC) at /kaggle/working/saved_models/best_model_mfcc.keras
In [57]:
print("\n" + "="*60)
print("SUMMARY OF RESULTS")
print("="*60)

for feat_type in feature_types:
    acc = results[feat_type]['test_accuracy'] * 100
    loss = results[feat_type]['test_loss']
    print(f"{feat_type.upper():12s} - Accuracy: {acc:6.2f}% | Loss: {loss:.4f}")

print("="*60)
print(f"Best Model: {best_feat_type.upper()} with {best_feat[1]['test_accuracy']*100:.2f}% accuracy")
print("="*60)
============================================================
SUMMARY OF RESULTS
============================================================
MFCC         - Accuracy:  82.96% | Loss: 0.7929
PLP          - Accuracy:  79.54% | Loss: 0.8523
LPCC         - Accuracy:  68.70% | Loss: 1.3467
GFCC         - Accuracy:  81.48% | Loss: 0.7513
============================================================
Best Model: MFCC with 82.96% accuracy
============================================================
In [ ]: